require(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Data

sal_data = read.csv('C:/Users/prana/Desktop/IS-680-R/Project_Datasets/Levels_Fyi_Salary_Data.csv')
sal_data
names(sal_data)
##  [1] "timestamp"               "company"                
##  [3] "level"                   "title"                  
##  [5] "totalyearlycompensation" "location"               
##  [7] "yearsofexperience"       "yearsatcompany"         
##  [9] "tag"                     "basesalary"             
## [11] "stockgrantvalue"         "bonus"                  
## [13] "gender"                  "otherdetails"           
## [15] "cityid"                  "dmaid"                  
## [17] "rowNumber"               "Masters_Degree"         
## [19] "Bachelors_Degree"        "Doctorate_Degree"       
## [21] "Highschool"              "Some_College"           
## [23] "Race_Asian"              "Race_White"             
## [25] "Race_Two_Or_More"        "Race_Black"             
## [27] "Race_Hispanic"           "Race"                   
## [29] "Education"
ncol(sal_data)
## [1] 29

Checking for the number of columns

nrow(sal_data)
## [1] 62642

Checking for the number of rows

summary(sal_data)
##   timestamp           company             level              title          
##  Length:62642       Length:62642       Length:62642       Length:62642      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  totalyearlycompensation   location         yearsofexperience yearsatcompany  
##  Min.   :  10000         Length:62642       Min.   : 0.000    Min.   : 0.000  
##  1st Qu.: 135000         Class :character   1st Qu.: 3.000    1st Qu.: 0.000  
##  Median : 188000         Mode  :character   Median : 6.000    Median : 2.000  
##  Mean   : 216300                            Mean   : 7.204    Mean   : 2.702  
##  3rd Qu.: 264000                            3rd Qu.:10.000    3rd Qu.: 4.000  
##  Max.   :4980000                            Max.   :69.000    Max.   :69.000  
##                                                                               
##      tag              basesalary      stockgrantvalue       bonus        
##  Length:62642       Min.   :      0   Min.   :      0   Min.   :      0  
##  Class :character   1st Qu.: 108000   1st Qu.:      0   1st Qu.:   1000  
##  Mode  :character   Median : 140000   Median :  25000   Median :  14000  
##                     Mean   : 136687   Mean   :  51486   Mean   :  19335  
##                     3rd Qu.: 170000   3rd Qu.:  65000   3rd Qu.:  26000  
##                     Max.   :1659870   Max.   :2800000   Max.   :1000000  
##                                                                          
##     gender          otherdetails           cityid          dmaid      
##  Length:62642       Length:62642       Min.   :    0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.: 7369   1st Qu.:506.0  
##  Mode  :character   Mode  :character   Median : 7839   Median :807.0  
##                                        Mean   : 9856   Mean   :616.1  
##                                        3rd Qu.:11521   3rd Qu.:807.0  
##                                        Max.   :47926   Max.   :881.0  
##                                                        NA's   :2      
##    rowNumber     Masters_Degree   Bachelors_Degree Doctorate_Degree 
##  Min.   :    1   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:20069   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :42019   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :41695   Mean   :0.2457   Mean   :0.2012   Mean   :0.02878  
##  3rd Qu.:63022   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :83875   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                     
##    Highschool        Some_College        Race_Asian       Race_White    
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000000   Median :0.000000   Median :0.0000   Median :0.0000  
##  Mean   :0.005108   Mean   :0.005667   Mean   :0.1879   Mean   :0.1282  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.0000   Max.   :1.0000  
##                                                                         
##  Race_Two_Or_More    Race_Black      Race_Hispanic         Race          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Length:62642      
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   Class :character  
##  Median :0.00000   Median :0.00000   Median :0.00000   Mode  :character  
##  Mean   :0.01283   Mean   :0.01102   Mean   :0.01804                     
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000                     
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000                     
##                                                                          
##   Education        
##  Length:62642      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Outliers Detection and Removal

ggplot(sal_data, aes(totalyearlycompensation)) +
  geom_boxplot() +
  coord_flip()

q1 <- quantile(sal_data$totalyearlycompensation, 0.25)
q3 <- quantile(sal_data$totalyearlycompensation, 0.75)
iqr <- IQR(sal_data$totalyearlycompensation)
q1
##    25% 
## 135000
q3
##    75% 
## 264000
iqr
## [1] 129000
no_outliers <- subset(sal_data, sal_data$totalyearlycompensation > (q1 - 1.5*iqr) & sal_data$totalyearlycompensation < (q3 + 1.5*iqr))
no_outliers
nrow(sal_data) - nrow(no_outliers)
## [1] 3133
ggplot(no_outliers, aes(totalyearlycompensation)) +
  geom_boxplot() +
  coord_flip()

Now as you can see in the above box plot there are no outliers in the column.

Nulls Detection & Removal

sum(is.na(no_outliers))
## [1] 108708
no_na <- na.omit(sal_data)
no_na_ot <- na.omit(no_outliers)
no_na
no_na_ot
ggplot(sal_data, aes(totalyearlycompensation), fill = title) +
  geom_histogram() +
  facet_wrap(~title)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looking at the histogram we can say that Software Engineer most of the data is in distributed to Software Engineer.

Top companies that offer more Compensation as per dataset

tot_sal <- sal_data %>%
            group_by(company) %>%
            summarize(tot_salry = sum(totalyearlycompensation)) %>%
            arrange(desc(tot_salry))
tot_sal

Here in the chunk above we got the companies that offer more compensation by sorting ‘totalyearlycompensation’ column.

tot_sal$company[1:10]
##  [1] "Amazon"     "Google"     "Microsoft"  "Facebook"   "Apple"     
##  [6] "Salesforce" "Uber"       "Oracle"     "LinkedIn"   "Cisco"

In the above two chunk the Top 10 companies who offer more compensation to their employees has been extracted.


lvl_exp <- sal_data %>%
            select(level,yearsofexperience,location) %>%
            mutate(most_exp = yearsofexperience > 40) %>%
            group_by(level, location) %>%
            arrange(desc(most_exp))
            
lvl_exp

Displaying the levels which have experience more than 40 years.

lvl_exp$level[1:6]
## [1] "L6"             "L6"             "SDE 5"          "Consulting MTS"
## [5] "Band 9"         "5"

Top 6 levels which have an experience of more than 40 years.

cmp_bns <- no_outliers %>%
            select(company, bonus) %>%
            group_by(company) %>%
            summarise(total_bns = sum(bonus)) %>%
            arrange(desc(total_bns))
cmp_bns

The above chunk elaborates about the companies which gave more bonus compared to other companies in descending order.

city_max <- no_outliers %>%
              group_by(location) %>%
              summarize(max_sal = max(totalyearlycompensation)) %>%
              arrange(desc(max_sal))
city_max

From the above chunk you can extract the info. like in which city you are most likely to get paid more.

unique(sal_data[c("gender")])

Exploratory Analysis

ggplot(data = sal_data,
       mapping = aes(x = totalyearlycompensation)) +
  geom_histogram(alpha = 0.5, bins = 50) +
  labs(x = "yearly_compensation", y = "number_of_employees",
       title = "Compensation based on gender") +
  facet_wrap(~gender)

ggplot(data = no_outliers,
       mapping = aes(x = totalyearlycompensation)) +
  geom_histogram(alpha = 0.5, bins = 50) +
  labs(x = "yearly_compensation", y = "number_of_employees",
       title = "Compensation based on gender") +
  facet_wrap(~gender)

As you can see the difference in the above and below histogram with the same options. The output for the second histogram which is data without any outliers in it gave us the best distribution.

ggplot(data = no_na_ot,
       mapping = aes(x = totalyearlycompensation)) +
  geom_histogram(alpha = 0.5, bins = 50) +
  labs(x = "yearly_compensation", y = "number_of_employees",
       title = "Compensation based on gender") +
  facet_wrap(~gender)

Distribution Graph acheived after outliers and Nulls Removal.

ggplot(data = no_outliers,
       mapping = aes(x = title, y = yearsofexperience)) +
  geom_point(aes(colour = gender)) +
  theme_bw() +
  coord_flip()

The plot above includes the unwanted data, nulls and outliers, where we are not able to extract the information clearly.

ggplot(data = no_na_ot,
       mapping = aes(x = title, y = yearsofexperience, fill = Masters_Degree)) +
  geom_point(aes(colour = gender)) +
  theme_bw() +
  coord_flip()

Looking at this plot we can say which gender has more experience based on title after nulls and outlier removal.

ggplot(data = no_na_ot,
       mapping = aes(x = title, y = totalyearlycompensation)) +
  geom_point(aes(colour = gender)) +
  theme_bw() +
  coord_flip()